# Knit relative to the project root so paths like file.path("cache", ...) resolve
# the same way inside and outside RStudio.
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
# ProjectTemplate bootstrap: load.project() attaches the libraries, data sets,
# and helper functions declared in the project configuration.
library(ProjectTemplate)
load.project()
This notebook includes part of the preprocessing and data cleaning necessary for creating a model frame to pass to a function call which will execute the model fitting, including imputing any missing values using information contained in other columns for a given record.
We can start by removing any columns which do not exhibit any variability (which have no utility with respect to fitting a model.)
load(file.path("cache", "loan.RData"))
# Keep only columns with more than one distinct value; constant columns carry no
# information for modeling. A logical keep-mask is used instead of negative
# index removal: the original `loan[, -match(...)]` drops EVERY column when no
# constant column exists (match on an empty set yields integer(0), and
# `loan[, integer(0)]` selects zero columns).
loan <- loan[, vapply(loan, function(x) length(unique(x)) > 1, logical(1))]
To make use of the date columns, we can create a variable capturing the length of time between the event date and the date that the loan was issued. The cred_tenure_years variable captures the length of time that the loan applicant has been a customer with the lending company, and weeks_btwn_credit_pull_loan_issue captures the amount of time between the most recent credit pull of the loan applicant and the date that the loan was issued. Negative values indicate that the last credit pull was after the loan was issued, and positive values indicate that the most recent credit pull was prior to loan origination.
# Turn the raw dates into durations, then drop the date columns themselves.
# cred_tenure_years: years between the earliest credit line and loan issuance.
# weeks_btwn_credit_pull_loan_issue: weeks between the last credit pull and
# issuance (negative when the pull happened after the loan was issued).
# Fix: use 365.25 days/year (leap-year average); 365.5 was a typo.
loan <- loan %>%
    mutate(cred_tenure_years = as.numeric(issue_d - earliest_cr_line)/365.25,
        weeks_btwn_credit_pull_loan_issue = as.numeric(issue_d - last_credit_pull_d)/7) %>%
    dplyr::select(-c(last_credit_pull_d, issue_d, earliest_cr_line))
The term column has whitespace on the left side of the entries, so removing any whitespace from all character variables.
loan <- loan %>% mutate_if(is.character, funs(str_trim(., side = "both")))
Let’s take a look at the distribution of the categorical variables:
# Character columns with fewer than 40 distinct values — few enough levels to
# inspect with bar charts. vapply (instead of sapply) pins the return types,
# and loan[[col]] replaces the roundabout as.vector(data.frame(loan)[, col]).
char_cols <- names(loan)[vapply(loan, is.character, logical(1))]
char_level_counts <- vapply(char_cols, function(col) length(unique(loan[[col]])),
    integer(1))
short_char_columns <- names(char_level_counts)[char_level_counts < 40]
short_char_columns
## [1] "application_type" "emp_length" "grade"
## [4] "home_ownership" "initial_list_status" "loan_status"
## [7] "purpose" "sub_grade" "term"
## [10] "title" "verification_status"
# Bar charts for every short categorical variable, arranged as four stacked
# rows: loan attributes, grade-related columns, applicant attributes, and state.
p_loan_attrs <- loan %>%
    dplyr::select(loan_status, purpose, verification_status) %>%
    gather(key = column, value = value) %>%
    ggplot(aes(x = value)) +
    geom_bar() +
    theme_bw() +
    facet_wrap(~column, nrow = 1, scales = "free") +
    theme(axis.text.x = element_text(angle = 35, hjust = 1, vjust = 1, size = 6)) +
    xlab("")
p_grade_attrs <- loan %>%
    dplyr::select(initial_list_status, grade, sub_grade) %>%
    gather(key = column, value = value) %>%
    ggplot(aes(x = value)) +
    geom_bar() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 35, hjust = 1, vjust = 1, size = 5)) +
    facet_wrap(~column, nrow = 1, scales = "free") +
    xlab("")
p_applicant_attrs <- loan %>%
    dplyr::select(application_type, home_ownership, term, emp_length) %>%
    gather(key = column, value = value) %>%
    ggplot(aes(x = value)) +
    geom_bar() +
    theme_bw() +
    facet_wrap(~column, nrow = 1, scales = "free") +
    theme(axis.text.x = element_text(angle = 35, hjust = 1, vjust = 1)) +
    xlab("")
p_state <- loan %>%
    ggplot(aes(x = addr_state)) +
    geom_bar() +
    theme_bw() +
    xlab("") +
    theme(axis.text.x = element_text(angle = 35, hjust = 1, vjust = 1))
grid.arrange(p_loan_attrs, p_grade_attrs, p_applicant_attrs, p_state, ncol = 1,
    nrow = 4)
There are a few categorical variables which are notably unbalanced - application type, status, purpose, and home ownership. How is the “ANY” home ownership category defined? Given more information, depending on the definition, we may want to remove this category.
If we take a closer look at loan title, though there are a reasonable number of factor levels, there are still some very small categories.
# Within-title composition of loan grades, with the per-title count printed
# above each bar.
title_grade_mix <- loan %>%
    group_by(title, grade) %>%
    dplyr::count() %>%
    ungroup %>%
    group_by(title) %>%
    mutate(p = n/sum(n), N = sum(n)) %>%
    ungroup
ggplot(title_grade_mix, aes(x = title, y = p)) +
    geom_bar(aes(color = grade, fill = grade), stat = "identity") +
    theme_bw() +
    scale_color_tableau() +
    scale_fill_tableau() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        panel.grid = element_blank()) +
    geom_text(aes(x = title, y = 1.01, label = paste0("N = ", N)), angle = 90,
        size = 2, hjust = 0, vjust = 0.5) +
    scale_y_continuous(breaks = c(0, 1), limits = c(0, 1.17)) +
    ylab("")
These can be aggregated and cleaned up:
# Fold one-off and misspelled loan titles into the standard categories, and
# send NA titles to "Other". The column is removed and re-added via mutate so
# the (re)built title column lands at the end of the frame, exactly as before.
cleaned_title <- loan$title
cleaned_title[cleaned_title == "Prescription Drug and Medical Costs"] <- "Medical expenses"
cleaned_title[cleaned_title == "new kitchen for momma!"] <- "Home improvement"
cleaned_title[cleaned_title %in% c("DebtC", "Credit Card/Auto Repair",
    "Paying off higher interest cards & auto")] <- "Debt consolidation"
cleaned_title[cleaned_title %in% c("Pay off Lowes Card",
    "New Baby and New House (CC Consolidate)")] <- "Credit card refinancing"
cleaned_title[cleaned_title %in% c("Trying to come back to reality!",
    "Student Loan", "Simple Loan Until Contract Is Completed", "SAVE",
    "odymeds", "considerate", "new day", "Learning and training",
    "Green loan")] <- "Other"
cleaned_title[is.na(cleaned_title)] <- "Other"
loan <- loan %>%
    dplyr::select(-title) %>%
    mutate(title = cleaned_title)
rm(cleaned_title)
# Re-draw the title/grade composition plot on the cleaned titles.
cleaned_title_mix <- loan %>%
    group_by(title, grade) %>%
    dplyr::count() %>%
    ungroup %>%
    group_by(title) %>%
    mutate(p = n/sum(n), N = sum(n)) %>%
    ungroup
ggplot(cleaned_title_mix, aes(x = title, y = p)) +
    geom_bar(aes(color = grade, fill = grade), stat = "identity") +
    theme_bw() +
    scale_color_tableau() +
    scale_fill_tableau() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        panel.grid = element_blank()) +
    geom_text(aes(x = title, y = 1.01, label = paste0("N = ", N)), angle = 90,
        size = 2.7, hjust = 0, vjust = 0.5) +
    scale_y_continuous(breaks = c(0, 1), limits = c(0, 1.17)) +
    ylab("")
We see the same issues with loan purpose factor levels, so I have aggregated very small categories again.
# Within-purpose composition of loan grades, with per-purpose counts on top.
purpose_grade_mix <- loan %>%
    group_by(purpose, grade) %>%
    dplyr::count() %>%
    ungroup %>%
    group_by(purpose) %>%
    mutate(p = n/sum(n), N = sum(n)) %>%
    ungroup
ggplot(purpose_grade_mix, aes(purpose, p, fill = grade, color = grade)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    scale_color_tableau() +
    scale_fill_tableau() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    geom_text(aes(x = purpose, y = 1.01, label = paste0("N = ", N)), angle = 90,
        size = 2.7, hjust = 0, vjust = 0.5, inherit.aes = FALSE) +
    scale_y_continuous(breaks = c(0, 1), limits = c(0, 1.17))
# Fold the tiny "wedding" and "educational" purpose levels into "other";
# remove-then-mutate keeps the rebuilt column at the end of the frame.
cleaned_purpose <- loan$purpose
cleaned_purpose[cleaned_purpose %in% c("wedding", "educational")] <- "other"
loan <- loan %>%
    dplyr::select(-purpose) %>%
    mutate(purpose = cleaned_purpose)
rm(cleaned_purpose)
It is possible that we need to predict the grade of a loan having a term length different than 36 and 60 months, so we’ll treat loan term as a numeric variable.
# Parse the month count out of the term string (e.g. "36 months" -> 36), then
# drop the raw text column. Extracting the digits generalizes beyond the two
# observed values — the surrounding text explicitly anticipates predicting for
# term lengths other than 36 and 60 months, which the original case_when
# mapping would have turned into NA.
loan <- loan %>%
    mutate(term_length_months = as.numeric(str_extract(term, "\\d+"))) %>%
    dplyr::select(-term)
3 of the categorical variables have more than 50 levels (so I didn’t include bar graphs displaying these distributions.)
# Character columns with more than 50 distinct values (too many levels to plot),
# followed by their level counts. vapply replaces the sapply chain for type
# safety, and loan[[col]] replaces as.vector(data.frame(loan)[, col]).
remaining_char_cols <- names(loan)[vapply(loan, is.character, logical(1))]
remaining_level_counts <- vapply(remaining_char_cols,
    function(col) length(unique(loan[[col]])), integer(1))
long_char_columns <- names(remaining_level_counts)[remaining_level_counts > 50]
vapply(long_char_columns, function(col) length(unique(loan[[col]])), integer(1))
## emp_title url zip_code
## 114475 421095 914
Each loan has its own distinct url, so it isn’t useful for modeling. Information contained in emp_title may be useful for predicting loan grade, but the entries of that column are very unstructured, with many levels having only a single observation, and it would need to be cleaned up before adding it as a covariate.
Out of the total 114475 unique observed values of emp_title, 91607 of those values are associated with a single loan application. Many of these should be labelled according to the same category; for example, of the unique observed values, 13391 contain the word ‘manager’. If more time were available, one might investigate using word embedding techniques to make use of this column, but for now, I’ll leave it out of the model.
loan <- loan %>% dplyr::select(-emp_title)
There are 914 unique values of zip_code in the dataset. Effects of zip codes with only a few loans may not even be estimable if enough of them aren’t included in the training data, and we won’t be able to make predictions for loans with zip codes not present in the training data.
loan %>% group_by(zip_code) %>% dplyr::count() %>% ungroup %>% arrange(n)
## # A tibble: 914 x 2
## zip_code n
## <chr> <int>
## 1 007xx 1
## 2 203xx 1
## 3 205xx 1
## 4 269xx 1
## 5 340xx 1
## 6 343xx 1
## 7 348xx 1
## 8 399xx 1
## 9 502xx 1
## 10 503xx 1
## # ... with 904 more rows
To reduce dimensionality and avoid overfitting, we need to aggregate levels of zip_code having small numbers of observations by combining them into a single category. The value for addr_state will capture all of the variability in loan grade attributable to geography for states having only a single value of zip_code, so these zip codes can be aggregated to a single category with no information loss at all as long as state is included in the model. I’ll create a category for zip codes having fewer than 200 loans associated with them. Why 200? While somewhat arbitrary, 200 observations should be adequate for estimating the probability of loan grade for each of A, B, C, D, E, F, and G. Alternatively, you could view the number of levels of a factor considered for splitting trees in the construction of the random forest as a hyperparameter. The number of levels considered then becomes part of the model selection problem.
# Collapse zip codes with fewer than 200 loans into a single "other" level;
# remove-then-mutate keeps the rebuilt column at the end of the frame.
zip_counts <- loan %>%
    dplyr::count(zip_code) %>%
    filter(n >= 200)
zip_code <- loan$zip_code
zip_code[!(zip_code %in% zip_counts$zip_code)] <- "other"
loan <- loan %>%
    dplyr::select(-zip_code) %>%
    mutate(zip_code = zip_code)
The bulk of the janitorial work with the numeric variables in the dataset is comprised of dealing with missing values using some old fashioned common sense if possible, eliminating uninformative or highly correlated variables, and addressing any detectable outliers.
# Tabulate missing-value counts for every numeric column, most-missing first:
# flag NA cells, sum the flags per column, then reshape to a long table.
na_counts <- loan %>%
    mutate_if(is.numeric, is.na) %>%
    summarize_if(is.logical, sum) %>%
    gather(key = variable, value = NA_count) %>%
    arrange(desc(NA_count))
kable(na_counts)
| variable | NA_count |
|---|---|
| dti_joint | 420586 |
| annual_inc_joint | 420584 |
| il_util | 402478 |
| mths_since_rcnt_il | 400285 |
| all_util | 399723 |
| inq_fi | 399723 |
| inq_last_12m | 399723 |
| max_bal_bc | 399723 |
| open_acc_6m | 399723 |
| open_il_12m | 399723 |
| open_il_24m | 399723 |
| open_rv_12m | 399723 |
| open_rv_24m | 399723 |
| total_bal_il | 399723 |
| total_cu_tl | 399723 |
| mths_since_last_record | 346680 |
| mths_since_recent_bc_dlq | 312495 |
| mths_since_last_major_derog | 298366 |
| mths_since_recent_revol_delinq | 269358 |
| mths_since_last_delinq | 203962 |
| mths_since_recent_inq | 44599 |
| num_tl_120dpd_2m | 19230 |
| mo_sin_old_il_acct | 12254 |
| percent_bc_gt_75 | 4239 |
| bc_util | 4227 |
| bc_open_to_buy | 3963 |
| mths_since_recent_bc | 3798 |
| revol_util | 162 |
| weeks_btwn_credit_pull_loan_issue | 9 |
| dti | 2 |
| num_rev_accts | 1 |
| acc_now_delinq | 0 |
| acc_open_past_24mths | 0 |
| annual_inc | 0 |
| avg_cur_bal | 0 |
| chargeoff_within_12_mths | 0 |
| collection_recovery_fee | 0 |
| collections_12_mths_ex_med | 0 |
| delinq_2yrs | 0 |
| delinq_amnt | 0 |
| fico_range_high | 0 |
| fico_range_low | 0 |
| funded_amnt | 0 |
| funded_amnt_inv | 0 |
| id | 0 |
| inq_last_6mths | 0 |
| installment | 0 |
| last_fico_range_high | 0 |
| last_fico_range_low | 0 |
| loan_amnt | 0 |
| mort_acc | 0 |
| mo_sin_old_rev_tl_op | 0 |
| mo_sin_rcnt_rev_tl_op | 0 |
| mo_sin_rcnt_tl | 0 |
| num_accts_ever_120_pd | 0 |
| num_actv_bc_tl | 0 |
| num_actv_rev_tl | 0 |
| num_bc_sats | 0 |
| num_bc_tl | 0 |
| num_il_tl | 0 |
| num_op_rev_tl | 0 |
| num_rev_tl_bal_gt_0 | 0 |
| num_sats | 0 |
| num_tl_30dpd | 0 |
| num_tl_90g_dpd_24m | 0 |
| num_tl_op_past_12m | 0 |
| open_acc | 0 |
| out_prncp | 0 |
| out_prncp_inv | 0 |
| pct_tl_nvr_dlq | 0 |
| pub_rec | 0 |
| pub_rec_bankruptcies | 0 |
| recoveries | 0 |
| revol_bal | 0 |
| tax_liens | 0 |
| total_acc | 0 |
| total_bal_ex_mort | 0 |
| total_bc_limit | 0 |
| total_il_high_credit_limit | 0 |
| total_pymnt | 0 |
| total_pymnt_inv | 0 |
| total_rev_hi_lim | 0 |
| tot_coll_amt | 0 |
| tot_cur_bal | 0 |
| tot_hi_cred_lim | 0 |
| cred_tenure_years | 0 |
| term_length_months | 0 |
I examined plots of the univariate distributions for degenerate variables and outliers; the column minimum, median, and maximum are indicated by the dotted vertical lines:
# Build one histogram per numeric predictor (id excluded), with dotted vertical
# lines at the column minimum, maximum, and median.
# Fix: the original guard `this_column != c("policy_code", "term_length_months")`
# compares a scalar against a length-2 vector — an error on R >= 4.2 and, on
# older R, a condition that only ever excluded "policy_code". setdiff excludes
# both intended columns.
gglist <- list()
numeric_columns <- names(which(sapply(loan[, -match("id", names(loan))], is.numeric)))
for (this_column in setdiff(numeric_columns, c("policy_code", "term_length_months"))) {
    # Hoist the column values once; vline positions are constants, so they are
    # passed directly to geom_vline rather than through aes().
    column_values <- as.vector(data.frame(loan)[, this_column])
    gglist <- list.append(gglist, ggplot(loan, aes_string(x = this_column)) +
        geom_histogram() + theme_bw() + theme(axis.text.x = element_text(angle = 45,
        hjust = 1, vjust = 1)) + geom_vline(xintercept = min(column_values,
        na.rm = TRUE), alpha = 0.5, linetype = 2) + geom_vline(xintercept = max(column_values,
        na.rm = TRUE), alpha = 0.5, linetype = 2) + geom_vline(xintercept = median(column_values,
        na.rm = TRUE), alpha = 0.5, linetype = 2))
}
# Render the histograms in pages of 15 (5 x 3 grids), with whatever remains
# after the 45th plot on a final 3 x 3 page.
marrangeGrob(gglist[1:15], nrow = 5, ncol = 3)
marrangeGrob(gglist[16:30], nrow = 5, ncol = 3)
marrangeGrob(gglist[31:45], nrow = 5, ncol = 3)
marrangeGrob(gglist[46:length(gglist)], nrow = 3, ncol = 3)
The histogram for weeks_btwn_credit_pull_loan_issue shows that, for all loans, the most recent credit pull took place after the loan originated. Does this date also correspond to when the loan grade was assigned? If the loan grade was assigned prior to the most recent credit pull, then this variable along with other credit variables collected at the same time should not be used to predict the loan grade. This is something I would verify before proceeding with model fitting.
Additionally, there are some numeric variables with notable high outliers (tax_liens - max of 85, pub_rec - max of 86):
# Jittered scatter of tax liens vs. public records, colored by loan grade, to
# inspect the high outliers in both columns.
ggplot(loan, aes(x = tax_liens, y = pub_rec)) +
    geom_jitter(aes(color = grade)) +
    theme_bw() +
    scale_color_tableau() +
    theme(legend.position = "bottom") +
    guides(color = guide_legend(nrow = 1))
The two observations which are the largest outliers with respect to tax_liens correspond to the same observations which are the largest outliers with respect to pub_rec. The plot below of number of public records versus (number of tax liens + number of publicly recorded bankruptcies) indicates that the types of records that comprise the pub_rec count include both tax_liens count and pub_rec_bankruptcies count. What other types of recorded events contribute to this count? From a modeling standpoint, it is preferential to include the marginal counts as covariates rather than an aggregated count.
# Public records vs. the sum of tax liens and recorded bankruptcies — points
# near the diagonal suggest pub_rec aggregates (at least) these two counts.
ggplot(loan, aes(x = tax_liens + pub_rec_bankruptcies, y = pub_rec)) +
    geom_jitter(aes(color = grade)) +
    theme_bw() +
    scale_color_tableau() +
    theme(legend.position = "bottom") +
    guides(color = guide_legend(nrow = 1))
The “joint” variables are missing for non-joint applications, so in order to use them, we need to impute the most sensical value. For individual applications, we’ll set the joint income and joint dti to the individual income and dti of the applicant. It is likely that, since there are so few joint applications, joint dti and dti will be highly correlated, as will annual income and joint annual income, but this information may be useful for predicting the grade for the small number of loans with joint applications.
# For individual applications the joint fields are NA, so fall back to the
# applicant's own income and dti; joint applications keep their joint values.
# The raw annual_inc_joint column is then dropped in favor of the filled one.
loan <- loan %>%
    mutate(
        annual_income_joint = case_when(
            application_type == "Joint App" ~ annual_inc_joint,
            application_type == "Individual" ~ annual_inc
        ),
        dti_joint = case_when(
            application_type == "Joint App" ~ dti_joint,
            application_type == "Individual" ~ dti
        )
    ) %>%
    dplyr::select(-annual_inc_joint)
The last_fico_range_low, last_fico_range_high, fico_range_low, fico_range_high variables are two numeric columns constructed to designate into which of 38 buckets the applicant’s fico score falls.
# List every observed (low, high) fico bucket with its count and bucket width.
loan %>%
    group_by(fico_range_low, fico_range_high) %>%
    dplyr::count() %>%
    ungroup %>%
    arrange(fico_range_low, fico_range_high) %>%
    mutate(fico_range = fico_range_high - fico_range_low) %>%
    print(n = Inf)
## # A tibble: 38 x 4
## fico_range_low fico_range_high n fico_range
## <int> <int> <int> <int>
## 1 660 664 41324 4
## 2 665 669 39245 4
## 3 670 674 39315 4
## 4 675 679 34479 4
## 5 680 684 33491 4
## 6 685 689 28730 4
## 7 690 694 27897 4
## 8 695 699 24869 4
## 9 700 704 22558 4
## 10 705 709 20530 4
## 11 710 714 17645 4
## 12 715 719 15567 4
## 13 720 724 13002 4
## 14 725 729 10151 4
## 15 730 734 8706 4
## 16 735 739 6709 4
## 17 740 744 5793 4
## 18 745 749 4619 4
## 19 750 754 4100 4
## 20 755 759 3482 4
## 21 760 764 2780 4
## 22 765 769 2610 4
## 23 770 774 2113 4
## 24 775 779 2065 4
## 25 780 784 1781 4
## 26 785 789 1455 4
## 27 790 794 1247 4
## 28 795 799 1065 4
## 29 800 804 917 4
## 30 805 809 767 4
## 31 810 814 611 4
## 32 815 819 474 4
## 33 820 824 359 4
## 34 825 829 264 4
## 35 830 834 160 4
## 36 835 839 98 4
## 37 840 844 62 4
## 38 845 850 55 5
# Same bucket tabulation for the last_fico range columns.
loan %>%
    group_by(last_fico_range_low, last_fico_range_high) %>%
    dplyr::count() %>%
    ungroup %>%
    arrange(last_fico_range_low, last_fico_range_high) %>%
    mutate(last_fico_range = last_fico_range_high - last_fico_range_low) %>%
    print(n = Inf)
## # A tibble: 72 x 4
## last_fico_range_low last_fico_range_high n last_fico_range
## <int> <int> <int> <int>
## 1 0 0 59 0
## 2 0 499 8629 499
## 3 500 504 1944 4
## 4 505 509 2074 4
## 5 510 514 2424 4
## 6 515 519 2599 4
## 7 520 524 2883 4
## 8 525 529 2963 4
## 9 530 534 3403 4
## 10 535 539 3418 4
## 11 540 544 3768 4
## 12 545 549 3620 4
## 13 550 554 3960 4
## 14 555 559 3875 4
## 15 560 564 3943 4
## 16 565 569 3688 4
## 17 570 574 3893 4
## 18 575 579 3558 4
## 19 580 584 3824 4
## 20 585 589 3437 4
## 21 590 594 3688 4
## 22 595 599 3527 4
## 23 600 604 3599 4
## 24 605 609 3537 4
## 25 610 614 3697 4
## 26 615 619 3596 4
## 27 620 624 4040 4
## 28 625 629 4004 4
## 29 630 634 4714 4
## 30 635 639 4789 4
## 31 640 644 5543 4
## 32 645 649 5962 4
## 33 650 654 6831 4
## 34 655 659 7944 4
## 35 660 664 8810 4
## 36 665 669 9930 4
## 37 670 674 11754 4
## 38 675 679 12207 4
## 39 680 684 13864 4
## 40 685 689 13773 4
## 41 690 694 15151 4
## 42 695 699 15068 4
## 43 700 704 14805 4
## 44 705 709 14896 4
## 45 710 714 14166 4
## 46 715 719 13908 4
## 47 720 724 13423 4
## 48 725 729 11715 4
## 49 730 734 11526 4
## 50 735 739 9701 4
## 51 740 744 9336 4
## 52 745 749 7919 4
## 53 750 754 7596 4
## 54 755 759 6874 4
## 55 760 764 5789 4
## 56 765 769 5944 4
## 57 770 774 5115 4
## 58 775 779 5030 4
## 59 780 784 4583 4
## 60 785 789 4072 4
## 61 790 794 3931 4
## 62 795 799 3296 4
## 63 800 804 3201 4
## 64 805 809 2699 4
## 65 810 814 2221 4
## 66 815 819 1820 4
## 67 820 824 1301 4
## 68 825 829 1038 4
## 69 830 834 613 4
## 70 835 839 342 4
## 71 840 844 174 4
## 72 845 850 71 5
loan <- loan %>% dplyr::select(-c(last_fico_range_low, fico_range_low))
Only the high end of the range is necessary to determine which bucket the score belongs to, so I’ll eliminate the range lows from the model frame.
The open_rv_12m, open_rv_24m, open_il_12m, and open_il_24m entries for certain records can be imputed using the number of open accounts of the appropriate type and the number of months since the most recent and oldest account was opened.
# Zero-fill open-account counts that are NA but logically must be zero: if the
# most recently opened account of that type is older than the window, no
# accounts of that type were opened within the window.
fill_zero_when_stale <- function(counts, months_since_recent, window_months) {
    counts[months_since_recent > window_months & is.na(counts)] <- 0
    counts
}
imputed_opens <- list(
    rv12 = fill_zero_when_stale(loan$open_rv_12m, loan$mo_sin_rcnt_rev_tl_op, 12),
    rv24 = fill_zero_when_stale(loan$open_rv_24m, loan$mo_sin_rcnt_rev_tl_op, 24),
    il12 = fill_zero_when_stale(loan$open_il_12m, loan$mths_since_rcnt_il, 12),
    il24 = fill_zero_when_stale(loan$open_il_24m, loan$mths_since_rcnt_il, 24)
)
# Remove-then-mutate keeps the rebuilt columns at the end of the frame, in the
# same order the original code appended them.
loan <- loan %>%
    dplyr::select(-c(open_rv_12m, open_rv_24m, open_il_12m, open_il_24m)) %>%
    mutate(open_rv_12m = imputed_opens$rv12, open_rv_24m = imputed_opens$rv24,
        open_il_12m = imputed_opens$il12, open_il_24m = imputed_opens$il24)
rm(imputed_opens)
rm(fill_zero_when_stale)
Accounts for which pct_tl_nvr_dlq == 100 do not have any delinquencies associated with them, so their corresponding entries in the mths_since_last_delinq and mths_since_recent_revol_delinq columns are missing. I’ll impute these with a negative integer to differentiate these accounts from those which have had a delinquency event.
# pct_tl_nvr_dlq == 100 means the applicant never had a delinquency, so mark
# the "months since" columns with a sentinel of -100 instead of leaving NA.
never_delinquent <- which(loan$pct_tl_nvr_dlq == 100)
loan$mths_since_last_delinq[never_delinquent] <- -100
loan$mths_since_recent_revol_delinq[never_delinquent] <- -100
rm(never_delinquent)
There are other columns with missing values which might be reasonable to impute using common sense and a bit of subject matter knowledge. Further analysis should include investigating how to impute missing values using information in existing columns.
Next, I eliminate any covariates which can be exactly expressed as linear combinations of other variables:
# Find exact linear dependencies among the fully-observed numeric columns
# (id excluded — it is an identifier, not a covariate).
numeric_cols <- names(loan)[vapply(loan, is.numeric, logical(1))]
complete_numeric_cols <- numeric_cols[colSums(is.na(loan[, numeric_cols])) == 0]
complete_numeric_cols <- setdiff(complete_numeric_cols, "id")
linear_combos <- caret::findLinearCombos(as.matrix(loan[, complete_numeric_cols]))
# BUG FIX: linearCombos holds indices into the matrix built from
# complete_numeric_cols; the original mapped them through the names of ALL
# numeric columns, so the reported column names were mislabeled.
lapply(linear_combos$linearCombos, function(idx) complete_numeric_cols[idx])
## [[1]]
## [1] "funded_amnt" "delinq_2yrs"
loan <- loan %>% dplyr::select(-c(funded_amnt, delinq_2yrs))
and for further dimension reduction, we can eliminate covariates which are highly correlated with other covariates:
# Pairwise-complete Pearson correlation matrix of the numeric columns,
# visualized as a tile heatmap on a blue-white-red scale.
numeric_col_names <- names(which(sapply(loan, is.numeric)))
correlationMatrix <- cor(loan[, numeric_col_names], use = "pairwise.complete.obs")
melted_corr <- melt(correlationMatrix)
names(melted_corr) <- c("var1", "var2", "R")
ggplot(melted_corr, aes(var1, var2, fill = R)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
        limit = c(-1, 1), space = "Lab", name = "Pearson\nCorrelation") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
    coord_fixed() +
    xlab("") +
    ylab("") +
    # this later theme() call merges with the one above, shrinking the x axis
    # text down to size 7 while keeping the angle/justification
    theme(axis.text.x = element_text(size = 7))
kable(melted_corr %>% filter(R > 0.85 & var1 != var2))
| var1 | var2 | R |
|---|---|---|
| annual_income_joint | annual_inc | 0.9995550 |
| recoveries | collection_recovery_fee | 0.9910568 |
| dti_joint | dti | 0.9744357 |
| dti | dti_joint | 0.9744357 |
| installment | funded_amnt_inv | 0.9411558 |
| loan_amnt | funded_amnt_inv | 0.9999945 |
| total_pymnt | funded_amnt_inv | 0.8561687 |
| total_pymnt_inv | funded_amnt_inv | 0.8561567 |
| funded_amnt_inv | installment | 0.9411558 |
| loan_amnt | installment | 0.9411739 |
| total_pymnt | installment | 0.8533709 |
| total_pymnt_inv | installment | 0.8533395 |
| funded_amnt_inv | loan_amnt | 0.9999945 |
| installment | loan_amnt | 0.9411739 |
| total_pymnt | loan_amnt | 0.8561142 |
| total_pymnt_inv | loan_amnt | 0.8560929 |
| cred_tenure_years | mo_sin_old_rev_tl_op | 0.9211809 |
| mths_since_recent_revol_delinq | mths_since_last_delinq | 0.9938262 |
| mths_since_last_delinq | mths_since_recent_revol_delinq | 0.9938262 |
| num_rev_tl_bal_gt_0 | num_actv_rev_tl | 0.9808494 |
| num_actv_rev_tl | num_rev_tl_bal_gt_0 | 0.9808494 |
| open_acc | num_sats | 0.9986025 |
| num_sats | open_acc | 0.9986025 |
| out_prncp_inv | out_prncp | 0.9999981 |
| out_prncp | out_prncp_inv | 0.9999981 |
| collection_recovery_fee | recoveries | 0.9910568 |
| total_bal_il | total_bal_ex_mort | 0.9042258 |
| total_il_high_credit_limit | total_bal_ex_mort | 0.8645172 |
| total_bal_ex_mort | total_bal_il | 0.9042258 |
| total_il_high_credit_limit | total_bal_il | 0.9586317 |
| total_bal_ex_mort | total_il_high_credit_limit | 0.8645172 |
| total_bal_il | total_il_high_credit_limit | 0.9586317 |
| funded_amnt_inv | total_pymnt | 0.8561687 |
| installment | total_pymnt | 0.8533709 |
| loan_amnt | total_pymnt | 0.8561142 |
| total_pymnt_inv | total_pymnt | 0.9999957 |
| funded_amnt_inv | total_pymnt_inv | 0.8561567 |
| installment | total_pymnt_inv | 0.8533395 |
| loan_amnt | total_pymnt_inv | 0.8560929 |
| total_pymnt | total_pymnt_inv | 0.9999957 |
| tot_hi_cred_lim | tot_cur_bal | 0.9749433 |
| tot_cur_bal | tot_hi_cred_lim | 0.9749433 |
| mo_sin_old_rev_tl_op | cred_tenure_years | 0.9211809 |
| annual_inc | annual_income_joint | 0.9995550 |
| open_rv_24m | open_rv_12m | 0.8619908 |
| open_rv_12m | open_rv_24m | 0.8619908 |
Note: We could spend much more time with exploratory analysis looking thoroughly at both the bivariate relationships between predictors as well as the bivariate relationships between predictors and loan grade, which could inform both variable selection as well as the form of the relationship between predictors and response. I am forgoing an exhaustive analysis for not only the sake of brevity, but because I am choosing to fit a random forest classifier. While the model fitting procedure for random forests does not invoke any automatic variable selection (fitting procedures for certain classes of models can, such as regularized regression-based methods using lasso penalty or variant thereof), these models are extremely flexible and capable of capturing complex, nonlinear relationships between predictors and the response and interaction effects between predictors. Variable importance measures obtained from fitted models can provide more informative means of variable selection. If variable importance measures indicate that a variable is not informative with respect to accurately predicting the response, then the model may be refit and these variables can be excluded.
# Final model frame: drop the highly correlated / redundant numeric columns and
# the id, convert the remaining character columns to factors, and cache the
# result for the model-fitting step. Passing as.factor directly replaces the
# deprecated funs() wrapper.
loan_mf <- loan %>%
    dplyr::select(-c(total_pymnt_inv, out_prncp_inv, funded_amnt_inv, total_pymnt,
        installment, avg_cur_bal, id, annual_income_joint)) %>%
    mutate_if(is.character, as.factor)
cache("loan_mf")